library(tidyverse) library(glue) # similar to paste() but more powerful
library(tidyverse) library(glue) # similar to paste() but more powerful
Creating fake data is really helpful for creating a reproducible example (aka reprex)
npats <- 100
nvisits <- 10
create_visit_df <- function(visit_number) {
tibble(
patid = 1:npats,
visitdate = lubridate::as_date(sample(0:1e4, 100)),
blooddraw = rbinom(npats, 1, prob = .7),
bloodrawwho_nurse = blooddraw*rbinom(npats, 1, prob = .7),
bloodrawwho_other = blooddraw*(1-bloodrawwho_nurse)
) %>%
rename_at(vars(-patid), ~paste0(., "_", visit_number))
}
visit_wide <- map(1:nvisits, create_visit_df) %>%
reduce(inner_join, by = "patid")
## # A tibble: 1,000 x 5 ## patid visitdate blooddraw bloodrawwho_nurse bloodrawwho_other ## <int> <date> <int> <int> <dbl> ## 1 1 1980-01-19 0 0 0 ## 2 1 1987-12-02 0 0 0 ## 3 1 1992-09-22 0 0 0 ## 4 1 1993-11-20 1 1 0 ## 5 1 1985-02-24 1 1 0 ## 6 1 1981-02-25 0 0 0 ## 7 1 1989-09-01 0 0 0 ## 8 1 1996-11-26 1 1 0 ## 9 1 1983-08-20 1 1 0 ## 10 1 1983-09-09 0 0 0 ## # … with 990 more rows
In computer science, a list or sequence is an abstract data type that represents a countable number of ordered values, where the same value may occur more than once.
random_stuff <- list(mtcars,
qplot(mpg, cyl, data = mtcars),
function(x) x^2,
list("a", "b", "c"))
str(random_stuff, max.level = 1)
## List of 4 ## $ :'data.frame': 32 obs. of 11 variables: ## $ :List of 9 ## ..- attr(*, "class")= chr [1:2] "gg" "ggplot" ## $ :function (x) ## ..- attr(*, "srcref")= 'srcref' int [1:8] 4 22 4 36 22 36 4 4 ## .. ..- attr(*, "srcfile")=Classes 'srcfilecopy', 'srcfile' <environment: 0x629eba8> ## $ :List of 3
More complex objects in R are often just lists.
list(1:3, 4:10, seq(0, 20, by = 2))
## [[1]] ## [1] 1 2 3 ## ## [[2]] ## [1] 4 5 6 7 8 9 10 ## ## [[3]] ## [1] 0 2 4 6 8 10 12 14 16 18 20
Put the output of the element in a new list
numeric_list <- list(1:3, 4:10, seq(0, 20, by = 2)) map(numeric_list, sum)
## [[1]] ## [1] 6 ## ## [[2]] ## [1] 49 ## ## [[3]] ## [1] 110
numeric_list <- list(1:3, 4:10, seq(0, 20, by = 2)) L2norm <- function(x) sqrt(sum(x^2)) map(numeric_list, L2norm)
## [[1]] ## [1] 3.741657 ## ## [[2]] ## [1] 19.26136 ## ## [[3]] ## [1] 39.24283
Also called an anonymous function since it has no name
numeric_list <- list(1:3, 4:10, seq(0, 20, by = 2)) map(numeric_list, function(x) sqrt(sum(x^2)))
## [[1]] ## [1] 3.741657 ## ## [[2]] ## [1] 19.26136 ## ## [[3]] ## [1] 39.24283
This shortcut only works in the tidyverse
numeric_list <- list(1:3, 4:10, seq(0, 20, by = 2)) map(numeric_list, ~sqrt(sum(.^2)))
## [[1]] ## [1] 3.741657 ## ## [[2]] ## [1] 19.26136 ## ## [[3]] ## [1] 39.24283
Use a function that takes two arguments and returns one value to reduce a list.
+ is such a function
numeric_list <- list(1:3, 4:10, seq(0, 20, by = 2)) map(numeric_list, ~sqrt(sum(.^2))) %>% reduce(`+`)
## [1] 62.24585
Anything inside {} is evaluated as code
map(1:3, ~glue("_{.}")) %>%
reduce(c)
## [1] "_1" "_2" "_3"
map_chr returns a character vector.
There are many other variants of the map function.
map_chr(1:3, ~glue("_{.}"))
## [1] "_1" "_2" "_3"
We can easily select all variables for one visit
select(visit_wide, patid, ends_with("_1"))
## # A tibble: 100 x 5 ## patid visitdate_1 blooddraw_1 bloodrawwho_nurse_1 bloodrawwho_other_1 ## <int> <date> <int> <int> <dbl> ## 1 1 1980-01-19 0 0 0 ## 2 2 1974-05-20 0 0 0 ## 3 3 1974-04-05 1 1 0 ## 4 4 1977-03-26 1 1 0 ## 5 5 1977-02-19 1 1 0 ## 6 6 1970-01-21 0 0 0 ## 7 7 1976-04-11 1 1 0 ## 8 8 1981-08-23 1 0 1 ## 9 9 1987-01-31 1 1 0 ## 10 10 1988-03-30 1 0 1 ## # … with 90 more rows
df_list <- map(1:nvisits, ~select(visit_wide, patid, ends_with(glue("_{.}"))))
str(df_list, max.level = 1)
## List of 10 ## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 5 variables: ## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 5 variables: ## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 5 variables: ## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 5 variables: ## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 5 variables: ## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 5 variables: ## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 5 variables: ## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 5 variables: ## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 5 variables: ## $ :Classes 'tbl_df', 'tbl' and 'data.frame': 100 obs. of 5 variables:
df_list[[1]]
## # A tibble: 100 x 5 ## patid visitdate_1 blooddraw_1 bloodrawwho_nurse_1 bloodrawwho_other_1 ## <int> <date> <int> <int> <dbl> ## 1 1 1980-01-19 0 0 0 ## 2 2 1974-05-20 0 0 0 ## 3 3 1974-04-05 1 1 0 ## 4 4 1977-03-26 1 1 0 ## 5 5 1977-02-19 1 1 0 ## 6 6 1970-01-21 0 0 0 ## 7 7 1976-04-11 1 1 0 ## 8 8 1981-08-23 1 0 1 ## 9 9 1987-01-31 1 1 0 ## 10 10 1988-03-30 1 0 1 ## # … with 90 more rows
df_list[[2]]
## # A tibble: 100 x 5 ## patid visitdate_2 blooddraw_2 bloodrawwho_nurse_2 bloodrawwho_other_2 ## <int> <date> <int> <int> <dbl> ## 1 1 1987-12-02 0 0 0 ## 2 2 1996-10-09 1 1 0 ## 3 3 1994-12-10 0 0 0 ## 4 4 1977-01-15 0 0 0 ## 5 5 1986-03-12 1 1 0 ## 6 6 1979-05-03 1 1 0 ## 7 7 1973-11-08 0 0 0 ## 8 8 1976-05-09 1 1 0 ## 9 9 1974-08-21 1 1 0 ## 10 10 1989-08-11 1 1 0 ## # … with 90 more rows
df_list2 <- map(df_list, ~rename_all(., ~str_remove(., "_[:digit:]+$"))) map(df_list2, names)
## [[1]] ## [1] "patid" "visitdate" "blooddraw" ## [4] "bloodrawwho_nurse" "bloodrawwho_other" ## ## [[2]] ## [1] "patid" "visitdate" "blooddraw" ## [4] "bloodrawwho_nurse" "bloodrawwho_other" ## ## [[3]] ## [1] "patid" "visitdate" "blooddraw" ## [4] "bloodrawwho_nurse" "bloodrawwho_other" ## ## [[4]] ## [1] "patid" "visitdate" "blooddraw" ## [4] "bloodrawwho_nurse" "bloodrawwho_other" ## ## [[5]] ## [1] "patid" "visitdate" "blooddraw" ## [4] "bloodrawwho_nurse" "bloodrawwho_other" ## ## [[6]] ## [1] "patid" "visitdate" "blooddraw" ## [4] "bloodrawwho_nurse" "bloodrawwho_other" ## ## [[7]] ## [1] "patid" "visitdate" "blooddraw" ## [4] "bloodrawwho_nurse" "bloodrawwho_other" ## ## [[8]] ## [1] "patid" "visitdate" "blooddraw" ## [4] "bloodrawwho_nurse" "bloodrawwho_other" ## ## [[9]] ## [1] "patid" "visitdate" "blooddraw" ## [4] "bloodrawwho_nurse" "bloodrawwho_other" ## ## [[10]] ## [1] "patid" "visitdate" "blooddraw" ## [4] "bloodrawwho_nurse" "bloodrawwho_other"
bind_rows takes two dataframes and rowbinds them reducing the list with bindrows will iteratively apply bind_rows so we end up with one dataframe
reduce(df_list2, bind_rows)
## # A tibble: 1,000 x 5 ## patid visitdate blooddraw bloodrawwho_nurse bloodrawwho_other ## <int> <date> <int> <int> <dbl> ## 1 1 1980-01-19 0 0 0 ## 2 2 1974-05-20 0 0 0 ## 3 3 1974-04-05 1 1 0 ## 4 4 1977-03-26 1 1 0 ## 5 5 1977-02-19 1 1 0 ## 6 6 1970-01-21 0 0 0 ## 7 7 1976-04-11 1 1 0 ## 8 8 1981-08-23 1 0 1 ## 9 9 1987-01-31 1 1 0 ## 10 10 1988-03-30 1 0 1 ## # … with 990 more rows
map(1:nvisits, ~select(visit_wide, patid, ends_with(glue("_{.}")))) %>%
map(~rename_all(., ~str_remove(., "_.$"))) %>%
reduce(bind_rows)
## # A tibble: 1,000 x 9 ## patid visitdate blooddraw bloodrawwho_nur… bloodrawwho_oth… ## <int> <date> <int> <int> <dbl> ## 1 1 1980-01-19 0 0 0 ## 2 2 1974-05-20 0 0 0 ## 3 3 1974-04-05 1 1 0 ## 4 4 1977-03-26 1 1 0 ## 5 5 1977-02-19 1 1 0 ## 6 6 1970-01-21 0 0 0 ## 7 7 1976-04-11 1 1 0 ## 8 8 1981-08-23 1 0 1 ## 9 9 1987-01-31 1 1 0 ## 10 10 1988-03-30 1 0 1 ## # … with 990 more rows, and 4 more variables: visitdate_10 <date>, ## # blooddraw_10 <int>, bloodrawwho_nurse_10 <int>, ## # bloodrawwho_other_10 <dbl>
map(1:nvisits, ~select(visit_wide, patid, ends_with(glue("_{.}")))) %>%
map_dfr(~rename_all(., ~str_remove(., "_[:digit:]+$")))
## # A tibble: 1,000 x 5 ## patid visitdate blooddraw bloodrawwho_nurse bloodrawwho_other ## <int> <date> <int> <int> <dbl> ## 1 1 1980-01-19 0 0 0 ## 2 2 1974-05-20 0 0 0 ## 3 3 1974-04-05 1 1 0 ## 4 4 1977-03-26 1 1 0 ## 5 5 1977-02-19 1 1 0 ## 6 6 1970-01-21 0 0 0 ## 7 7 1976-04-11 1 1 0 ## 8 8 1981-08-23 1 0 1 ## 9 9 1987-01-31 1 1 0 ## 10 10 1988-03-30 1 0 1 ## # … with 990 more rows
Create a SQLite database in memory and load our visit data into it
library(DBI) con <- dbConnect(RSQLite::SQLite(), ":memory:") dbWriteTable(con, "visit_wide", visit_wide) visit_db <- tbl(con, "visit_wide")
We just change bind_rows to union_all
map(1:nvisits, ~select(visit_db, patid, ends_with(glue("_{.}")))) %>%
map(~rename_all(., ~str_remove(., "_[:digit:]+$"))) %>%
reduce(union_all)
## # Source: lazy query [?? x 5] ## # Database: sqlite 3.22.0 [:memory:] ## patid visitdate blooddraw bloodrawwho_nurse bloodrawwho_other ## <int> <dbl> <int> <int> <dbl> ## 1 1 3670 0 0 0 ## 2 2 1600 0 0 0 ## 3 3 1555 1 1 0 ## 4 4 2641 1 1 0 ## 5 5 2606 1 1 0 ## 6 6 20 0 0 0 ## 7 7 2292 1 1 0 ## 8 8 4252 1 0 1 ## 9 9 6239 1 1 0 ## 10 10 6663 1 0 1 ## # … with more rows
map(1:nvisits, ~select(visit_db, patid, ends_with(glue("_{.}")))) %>%
map(~rename_all(., ~str_remove(., "_[:digit:]+$"))) %>%
reduce(union_all) %>%
show_query()
## <SQL> ## SELECT `patid`, `visitdate_1` AS `visitdate`, `blooddraw_1` AS `blooddraw`, `bloodrawwho_nurse_1` AS `bloodrawwho_nurse`, `bloodrawwho_other_1` AS `bloodrawwho_other` ## FROM `visit_wide` ## UNION ALL ## SELECT `patid`, `visitdate_2` AS `visitdate`, `blooddraw_2` AS `blooddraw`, `bloodrawwho_nurse_2` AS `bloodrawwho_nurse`, `bloodrawwho_other_2` AS `bloodrawwho_other` ## FROM `visit_wide` ## UNION ALL ## SELECT `patid`, `visitdate_3` AS `visitdate`, `blooddraw_3` AS `blooddraw`, `bloodrawwho_nurse_3` AS `bloodrawwho_nurse`, `bloodrawwho_other_3` AS `bloodrawwho_other` ## FROM `visit_wide` ## UNION ALL ## SELECT `patid`, `visitdate_4` AS `visitdate`, `blooddraw_4` AS `blooddraw`, `bloodrawwho_nurse_4` AS `bloodrawwho_nurse`, `bloodrawwho_other_4` AS `bloodrawwho_other` ## FROM `visit_wide` ## UNION ALL ## SELECT `patid`, `visitdate_5` AS `visitdate`, `blooddraw_5` AS `blooddraw`, `bloodrawwho_nurse_5` AS `bloodrawwho_nurse`, `bloodrawwho_other_5` AS `bloodrawwho_other` ## FROM `visit_wide` ## UNION ALL ## SELECT `patid`, `visitdate_6` AS `visitdate`, `blooddraw_6` AS `blooddraw`, `bloodrawwho_nurse_6` AS `bloodrawwho_nurse`, `bloodrawwho_other_6` AS `bloodrawwho_other` ## FROM `visit_wide` ## UNION ALL ## SELECT `patid`, `visitdate_7` AS `visitdate`, `blooddraw_7` AS `blooddraw`, `bloodrawwho_nurse_7` AS `bloodrawwho_nurse`, `bloodrawwho_other_7` AS `bloodrawwho_other` ## FROM `visit_wide` ## UNION ALL ## SELECT `patid`, `visitdate_8` AS `visitdate`, `blooddraw_8` AS `blooddraw`, `bloodrawwho_nurse_8` AS `bloodrawwho_nurse`, `bloodrawwho_other_8` AS `bloodrawwho_other` ## FROM `visit_wide` ## UNION ALL ## SELECT `patid`, `visitdate_9` AS `visitdate`, `blooddraw_9` AS `blooddraw`, `bloodrawwho_nurse_9` AS `bloodrawwho_nurse`, `bloodrawwho_other_9` AS `bloodrawwho_other` ## FROM `visit_wide` ## UNION ALL ## SELECT `patid`, `visitdate_10` AS `visitdate`, `blooddraw_10` AS `blooddraw`, `bloodrawwho_nurse_10` AS `bloodrawwho_nurse`, `bloodrawwho_other_10` AS `bloodrawwho_other` ## FROM `visit_wide`
The map and reduce functions allow us to program on a higher level of abstraction.
The scoped variants of dplyr work much the same way.
mutate_if(); mutate_at(); mutate_all() select_if(); select_at(); select_all() filter_if(); filter_at(); filter_all() rename_if(); rename_at(); rename_all() summarise_if(); summarise_at(); summarise_all()
These functions map over a dataframe which is a list of columns.
is.list(mtcars)
## [1] TRUE